if(!requireNamespace("MSTExplorer", quietly = TRUE)){
  remotes::install_github("neurogenomics/MSTExplorer", dependencies = TRUE)
}
if(!requireNamespace("echodata", quietly = TRUE)){
  remotes::install_github("RajLabMSSM/echodata", dependencies = TRUE)
}

knitr::opts_chunk$set(warning = FALSE,
                      cache = TRUE,
                      message = FALSE) 

Import data

Import CSL areas of interest

CSL areas of interest mapped onto Human Phenotype Ontology (HPO) and/or other disease ontologies (OMIM/ORPH).

csl <- data.table::fread(here::here("data/CSL_areas_of_interest.tsv"))
csl <- csl[Group=="Early Stage Partnering"]
# extract IDs from the Entry column of csl data.table with the pattern "HP:", "OMIM:", "ORPHA:" and put into a new col
csl[, ID := stringr::str_extract(Entry, "HP:\\d+|OMIM:\\d+|ORPHA:\\d+")]
# extract names from Entry column WITHOUT the ID
csl[, Name := trimws(stringr::str_remove(Entry, "HP:\\d+|OMIM:\\d+|ORPHA:\\d+"))]

MSTExplorer::create_dt(csl)

Extract IDs of phenotypes and diseases. Get any descendants of manually mapped HPO IDs as well.

hpo_ids <- csl[grepl("HP:",ID)]$ID |> unique()
disease_ids <- csl[!grepl("HP:",ID)]$ID |> unique()
hpo <- KGExplorer::get_ontology("hp")
hpo_ids_list <- lapply(stats::setNames(hpo_ids,hpo_ids),
                       function(x) simona::dag_offspring(dag = hpo, 
                                                         include_self = TRUE,
                                                         term = x)) 
hpo_ids_extended <- unlist(hpo_ids_list) |> unique() 

Import phenotype-cell type association results

results <- MSTExplorer::load_example_results()
results <- HPOExplorer::add_hpo_name(results, hpo = hpo)
results <- HPOExplorer::add_ont_lvl(results)
results <- HPOExplorer::add_ancestor(results, hpo = hpo)
results <- MSTExplorer::map_celltype(results)
MSTExplorer::add_logfc(results)
results[,effect:=estimate]

p2g <- HPOExplorer::load_phenotype_to_genes()

Prioritise targets

prioritise_targets_out <- MSTExplorer::prioritise_targets(
  results = results, 
  hpo = hpo,
  phenotype_to_genes = p2g,
  
  keep_deaths=NULL,
  keep_onsets=NULL,
  keep_specificity_quantiles = seq(30,40), ## NULL:70, 30-40:64 
  keep_mean_exp_quantiles = seq(30,40), ## NULL:65, 10:55
  info_content_threshold=8, ## 8:55, 5:64 
  effect_threshold=NULL, ## 1:39
  severity_score_gpt_threshold=NULL, ## 10:78, NULL:82
  symptom_intersection_threshold=.25, ## .25:57
  evidence_score_threshold=3,  ## 5:47, 4:47, 3:64
  top_n = 10, ## 5:38, 20:42, 30:45, 40:52, 50:55
  group_vars = "hpo_id")
 
all_targets <- prioritise_targets_out$top_targets
targets <- all_targets[
  (hpo_id %in% hpo_ids_extended 
   | disease_id %in% disease_ids
   ) 
]
MSTExplorer::create_dt(head(targets,100))

Summarise results:

message(paste0(
  length(intersect(all_targets$hpo_id,hpo_ids_extended)),"/",
        length(unique(hpo_ids_extended)),
        " (",round(length(intersect(hpo_ids_extended,all_targets$hpo_id))/
                  length(unique(hpo_ids_extended))*100,2),"%)",
        " CSL phenotypes (across ",length(unique(all_targets$disease_id))," diseases)",
  " covered in our prioritised targets."
))
## 594/1860 (31.94%) CSL phenotypes (across 4850 diseases) covered in our prioritised targets.

Get the top candidates per area of interest

Per HPO ancestor

top_targets_ancestor <- targets[!duplicated(ancestor_name),] 

Per CSL area of interest

csl_areas <- unique(csl[,c("ID","Area","Subarea")])
targets2 <- targets |>
  merge(csl_areas,
        all.x=TRUE,
        by.x="hpo_id",by.y="ID") |>
  merge(csl_areas,
        all.x=TRUE,
        by.x="disease_id",by.y="ID")
targets2[,Area:=data.table::fcoalesce(Area.x,Area.y)]
targets2[,Subarea:=data.table::fcoalesce(Subarea.x,Subarea.y)]
#### Select top N targets per CSL Area ####
num_cols <- sapply(targets2,is.numeric)
top_targets <- targets2[!is.na(Area), head(.SD,3), 
                               by=c("Area")]
# drop list columns
# save_path <- here::here("reports/top_targets_CSL.tsv")
# top_targets[,-names(top_targets)[sapply(top_targets,is.list)],with=FALSE] |>
#   data.table::fwrite(save_path, sep="\t")
# top_targets <- data.table::fread(here::here("top_targets_CSL.tsv"))

MSTExplorer::create_dt(top_targets)
target_counts <- targets2[,list(n_targets=.N, 
                                n_genes=data.table::uniqueN(gene_symbol),
                                n_phenotypes=data.table::uniqueN(hpo_id),
                                n_diseases=data.table::uniqueN(disease_id)
              ),by=c("Area","Subarea")][!is.na(Area)]|>
  data.table::setorderv("n_targets",-1)

MSTExplorer::create_dt(target_counts)

Explore top targets

Stroke

Check for other results with the same cell type and gene target.

stroke_targets <- targets2[grepl("stroke",hpo_name, ignore.case = TRUE)]
stroke_targets <- stroke_targets[gene_symbol=="COL4A1"]

ClinVar

gnomad

https://gnomad.broadinstitute.org/gene/ENSG00000187498?dataset=gnomad_r4

COL4A1 exonn positions: https://www.ensembl.org/Homo_sapiens/Transcript/Exons?db=core;g=ENSG00000187498;r=13:110213499-110214499;t=ENST00000375820;v=rs34004222;vdb=variation;vf=813354076

Exon 3: chr13:110213926-110214015

#### Import ####
gnomad <- data.table::fread(here::here("data/gnomAD_v4.0.0_ENSG00000187498_2024_02_08_11_13_20.csv"),
                             check.names = TRUE)
#### Add exon information ####
db <- TxDb.Hsapiens.UCSC.hg38.knownGene::TxDb.Hsapiens.UCSC.hg38.knownGene
ex.gn <- GenomicFeatures::exonsBy(db, "gene")
ex <- ex.gn[["1282"]]
ex$exon_width <- GenomicRanges::width(ex)
# txs <- GenomicFeatures::transcriptsBy(db, by = "gene")
# tx.gn <- GenomicFeatures::exonsBy(TxDb.Hsapiens.UCSC.hg38.knownGene::TxDb.Hsapiens.UCSC.hg38.knownGene, by = "tx")
# tx <- tx.gn[["1282"]]
#### Convert to GRanges ####
gn <- echodata::dt_to_granges(gnomad,
                              chrom_col = "Chromosome", 
                              start_col = "Position",
                              style="UCSC") |>
  ## Merge gnomad with exon data
  IRanges::mergeByOverlaps(ex)
gn$ex=NULL
gn$`echodata::dt_to_granges(gnomad, chrom_col = "Chromosome", start_col = "Position", `=NULL
gn <- data.table::as.data.table(gn)
gn[,exon_variants_n:=data.table::uniqueN(gnomAD.ID),by="exon_id"]
gn[,exon_variants_perbp:=data.table::uniqueN(gnomAD.ID)/exon_width,by="exon_id"]
gn[,exon_pathovariants_perbp:=data.table::uniqueN(
  gnomAD.ID[grepl("pathogenic$",ClinVar.Clinical.Significance,ignore.case = TRUE)|cadd>=20])/exon_width,by="exon_id"]
gn[,exon_pathovariants_cumfreq:=sum(
  Allele.Frequency[grepl("pathogenic$",ClinVar.Clinical.Significance,ignore.case = TRUE)|cadd>=20]),by="exon_id"]

gn_top <-gn[exon_pathovariants_cumfreq %in% head(data.table::fsort(unique(exon_pathovariants_cumfreq),decreasing=TRUE),2)][
  grepl("pathogenic$",ClinVar.Clinical.Significance,ignore.case = TRUE)|cadd>=20,
]
gn_top[,head(.SD,1),by="exon_id",
       .SDcols = c("exon_width",
                   "exon_variants_n",
                   "exon_variants_perbp",
                   "exon_pathovariants_perbp",
                   "exon_pathovariants_cumfreq")] |>
  MSTExplorer::create_dt()

Session info

sessionInfo()
## R version 4.3.1 (2023-06-16)
## Platform: aarch64-apple-darwin20 (64-bit)
## Running under: macOS Sonoma 14.4
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.11.0
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: Europe/London
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## loaded via a namespace (and not attached):
##   [1] ProtGenerics_1.34.0                     
##   [2] fs_1.6.3                                
##   [3] matrixStats_1.2.0                       
##   [4] bitops_1.0-7                            
##   [5] EnsDb.Hsapiens.v75_2.99.0               
##   [6] httr_1.4.7                              
##   [7] RColorBrewer_1.1-3                      
##   [8] doParallel_1.0.17                       
##   [9] tools_4.3.1                             
##  [10] backports_1.4.1                         
##  [11] utf8_1.2.4                              
##  [12] R6_2.5.1                                
##  [13] DT_0.32                                 
##  [14] lazyeval_0.2.2                          
##  [15] GetoptLong_1.0.5                        
##  [16] prettyunits_1.2.0                       
##  [17] cli_3.6.2                               
##  [18] Biobase_2.62.0                          
##  [19] sass_0.4.8                              
##  [20] readr_2.1.5                             
##  [21] ewceData_1.10.0                         
##  [22] Rsamtools_2.18.0                        
##  [23] yulab.utils_0.1.4                       
##  [24] R.utils_2.12.3                          
##  [25] dichromat_2.0-0.1                       
##  [26] orthogene_1.9.1                         
##  [27] maps_3.4.2                              
##  [28] limma_3.58.1                            
##  [29] rstudioapi_0.15.0                       
##  [30] RSQLite_2.3.5                           
##  [31] pals_1.9                                
##  [32] generics_0.1.3                          
##  [33] gridGraphics_0.5-1                      
##  [34] shape_1.4.6.1                           
##  [35] BiocIO_1.12.0                           
##  [36] crosstalk_1.2.1                         
##  [37] gtools_3.9.5                            
##  [38] car_3.1-2                               
##  [39] dplyr_1.1.4                             
##  [40] zip_2.3.1                               
##  [41] homologene_1.4.68.19.3.27               
##  [42] Matrix_1.6-5                            
##  [43] fansi_1.0.6                             
##  [44] S4Vectors_0.40.2                        
##  [45] abind_1.4-5                             
##  [46] R.methodsS3_1.8.2                       
##  [47] lifecycle_1.0.4                         
##  [48] scatterplot3d_0.3-44                    
##  [49] yaml_2.3.8                              
##  [50] carData_3.0-5                           
##  [51] SummarizedExperiment_1.32.0             
##  [52] gplots_3.1.3.1                          
##  [53] SparseArray_1.2.4                       
##  [54] BiocFileCache_2.10.1                    
##  [55] grid_4.3.1                              
##  [56] blob_1.2.4                              
##  [57] promises_1.2.1                          
##  [58] ExperimentHub_2.10.0                    
##  [59] crayon_1.5.2                            
##  [60] lattice_0.22-5                          
##  [61] GenomicFeatures_1.54.4                  
##  [62] chromote_0.2.0                          
##  [63] KEGGREST_1.42.0                         
##  [64] mapproj_1.2.11                          
##  [65] pillar_1.9.0                            
##  [66] knitr_1.45                              
##  [67] ComplexHeatmap_2.18.0                   
##  [68] KGExplorer_0.99.0                       
##  [69] GenomicRanges_1.54.1                    
##  [70] rjson_0.2.21                            
##  [71] codetools_0.2-19                        
##  [72] glue_1.7.0                              
##  [73] ggfun_0.1.4                             
##  [74] data.table_1.15.2                       
##  [75] vctrs_0.6.5                             
##  [76] png_0.1-8                               
##  [77] treeio_1.26.0                           
##  [78] gtable_0.3.4                            
##  [79] HPOExplorer_1.0.0                       
##  [80] cachem_1.0.8                            
##  [81] xfun_0.42                               
##  [82] openxlsx_4.2.5.2                        
##  [83] S4Arrays_1.2.1                          
##  [84] mime_0.12                               
##  [85] tidygraph_1.3.1                         
##  [86] SingleCellExperiment_1.24.0             
##  [87] RNOmni_1.0.1.2                          
##  [88] iterators_1.0.14                        
##  [89] simona_1.0.10                           
##  [90] statmod_1.5.0                           
##  [91] interactiveDisplayBase_1.40.0           
##  [92] ellipsis_0.3.2                          
##  [93] nlme_3.1-164                            
##  [94] ggtree_3.10.1                           
##  [95] EWCE_1.11.3                             
##  [96] bit64_4.0.5                             
##  [97] progress_1.2.3                          
##  [98] filelock_1.0.3                          
##  [99] rprojroot_2.0.4                         
## [100] GenomeInfoDb_1.38.7                     
## [101] bslib_0.6.1                             
## [102] KernSmooth_2.23-22                      
## [103] colorspace_2.1-0                        
## [104] BiocGenerics_0.48.1                     
## [105] DBI_1.2.2                               
## [106] tidyselect_1.2.1                        
## [107] processx_3.8.3                          
## [108] bit_4.0.5                               
## [109] compiler_4.3.1                          
## [110] curl_5.2.1                              
## [111] rvest_1.0.4                             
## [112] httr2_1.0.0                             
## [113] xml2_1.3.6                              
## [114] DelayedArray_0.28.0                     
## [115] plotly_4.10.4                           
## [116] rtracklayer_1.62.0                      
## [117] scales_1.3.0                            
## [118] caTools_1.18.2                          
## [119] rappdirs_0.3.3                          
## [120] stringr_1.5.1                           
## [121] digest_0.6.35                           
## [122] piggyback_0.1.5                         
## [123] rmarkdown_2.26                          
## [124] XVector_0.42.0                          
## [125] htmltools_0.5.7                         
## [126] pkgconfig_2.0.3                         
## [127] GeneOverlap_1.38.0                      
## [128] MatrixGenerics_1.14.0                   
## [129] echodata_0.99.17                        
## [130] dbplyr_2.4.0                            
## [131] fastmap_1.1.1                           
## [132] ensembldb_2.26.0                        
## [133] rlang_1.1.3                             
## [134] GlobalOptions_0.1.2                     
## [135] htmlwidgets_1.6.4                       
## [136] shiny_1.8.0                             
## [137] jquerylib_0.1.4                         
## [138] jsonlite_1.8.8                          
## [139] BiocParallel_1.36.0                     
## [140] R.oo_1.26.0                             
## [141] RCurl_1.98-1.14                         
## [142] magrittr_2.0.3                          
## [143] GenomeInfoDbData_1.2.11                 
## [144] ggplotify_0.1.2                         
## [145] patchwork_1.2.0                         
## [146] munsell_0.5.0                           
## [147] Rcpp_1.0.12                             
## [148] ape_5.7-1                               
## [149] babelgene_22.9                          
## [150] stringi_1.8.3                           
## [151] zlibbioc_1.48.0                         
## [152] AnnotationHub_3.10.0                    
## [153] plyr_1.8.9                              
## [154] parallel_4.3.1                          
## [155] Biostrings_2.70.2                       
## [156] hms_1.1.3                               
## [157] circlize_0.4.16                         
## [158] ps_1.7.6                                
## [159] igraph_2.0.3                            
## [160] ggpubr_0.6.0                            
## [161] ggsignif_0.6.4                          
## [162] reshape2_1.4.4                          
## [163] biomaRt_2.58.2                          
## [164] stats4_4.3.1                            
## [165] gprofiler2_0.2.3                        
## [166] BiocVersion_3.18.1                      
## [167] XML_3.99-0.16.1                         
## [168] evaluate_0.23                           
## [169] BiocManager_1.30.22                     
## [170] tzdb_0.4.0                              
## [171] foreach_1.5.2                           
## [172] httpuv_1.6.14                           
## [173] rols_2.30.2                             
## [174] grr_0.9.5                               
## [175] tidyr_1.3.1                             
## [176] purrr_1.0.2                             
## [177] clue_0.3-65                             
## [178] ggplot2_3.5.0                           
## [179] broom_1.0.5                             
## [180] xtable_1.8-4                            
## [181] restfulr_0.0.15                         
## [182] AnnotationFilter_1.26.0                 
## [183] tidytree_0.4.6                          
## [184] rstatix_0.7.2                           
## [185] later_1.3.2                             
## [186] viridisLite_0.4.2                       
## [187] MSTExplorer_1.0.0                       
## [188] TxDb.Hsapiens.UCSC.hg38.knownGene_3.18.0
## [189] Polychrome_1.5.1                        
## [190] tibble_3.2.1                            
## [191] websocket_1.4.1                         
## [192] aplot_0.2.2                             
## [193] memoise_2.0.1.9000                      
## [194] AnnotationDbi_1.64.1                    
## [195] GenomicAlignments_1.38.2                
## [196] IRanges_2.36.0                          
## [197] cluster_2.1.6                           
## [198] HGNChelper_0.8.1                        
## [199] here_1.0.1